# -*- coding: utf-8 -*-
import codecs
import sys,os
 

# Directory holding the NLPCC 2014 entity-linking sample data.  The value
# keeps its trailing backslash because file names are appended to it by
# plain string concatenation.
kb_path = (
    'E:\\desktop\\wu-request\\'
    'NLPCC 2014 Shared Tasks Guidelines\\'
    'Chinese Entity Linking  SAMPLE DATA NLPCC2014_EL_sample\\'
)
# Full path of the knowledge-base XML file that is read as input.
kb_file = ''.join([kb_path, 'PKBase_zhwiki_1_small.xml'])

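'''
Sample of the knowledge-base XML format (the attribute name really is spelled "enity_id" in the data):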
<entity enity_id="WKB349" title="GNU自由文档许可证">
    <name>gnu自由文件授权条款</name>
    <name>gnu_free_documentation_license</name>
    <publisher>free_software_foundation,_inc.</publisher>
    <date>当前版本 ：</date>
    <date>2008年11月3日</date>
    <debianapproved>是</debianapproved>
  </entity>

  <entity enity_id="WKB1928921" title="乌石镇_(雷州市)" />
  <entity enity_id="WKB1928922" title="温泉镇_(温县)" />

'''
def _attr_token(tag_line, attr_name):
    """Return the raw value token of ``attr_name`` from an opening tag line.

    For a double-quoted value the token runs from the opening quote to the
    closing quote, quotes included, so values containing spaces or ``=``
    are returned whole.  For an unquoted or unterminated value the token is
    cut at the first space, ``/>`` or ``>``.  An empty string is returned
    when the attribute is not present on the line.
    """
    marker = attr_name + '='
    start = tag_line.find(marker)
    if start < 0:
        return ''
    value_start = start + len(marker)
    if tag_line[value_start:value_start + 1] == '"':
        closing = tag_line.find('"', value_start + 1)
        if closing >= 0:
            return tag_line[value_start:closing + 1]
    # Unquoted / unterminated value: stop at the earliest tag delimiter.
    token = tag_line[value_start:]
    for terminator in (' ', '/>', '>'):
        cut = token.find(terminator)
        if cut >= 0:
            token = token[:cut]
    return token


def parse_entities(lines):
    """Parse knowledge-base lines into a list of entity records.

    ``lines`` is any iterable of text lines.  Each record has the shape
    ``((enity_id, title), [(property_name, property_value), ...])`` where
    ``enity_id`` and ``title`` keep their surrounding double quotes.

    Lines that cannot be attributed to an entity (text outside any
    ``<entity>`` element, a ``</entity>`` without a matching opening tag,
    or a property line containing no ``>``) are reported with a ``skip:``
    message and ignored.  A progress message is printed after every 1000
    records.
    """
    entities = []
    current = None

    def _store(record):
        entities.append(record)
        if len(entities) % 1000 == 0:
            print('entity_num= %d' % len(entities))

    for raw_line in lines:
        # strip() already removes the trailing '\n' / '\r\n'.
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('<entity'):
            header = (_attr_token(line, 'enity_id'), _attr_token(line, 'title'))
            record = (header, [])
            if line.endswith('/>'):
                # Self-closing entity: it has no property lines.
                _store(record)
                current = None
            else:
                current = record
        elif line.startswith('</entity>'):
            if current is None:
                # Closing tag with no open entity: nothing to store.
                print('skip: %s' % line)
            else:
                _store(current)
                current = None
        elif current is not None:
            pieces = line.split('>')
            if len(pieces) < 2:
                # Not of the form <name>value</name>; cannot be parsed.
                print('skip: %s' % line)
                continue
            prop_name = pieces[0].replace('<', ' ').strip()
            prop_value = pieces[1].split('<')[0]
            current[1].append((prop_name, prop_value))
        else:
            print('skip: %s' % line)

    return entities


def load_entities(path):
    """Read the UTF-8 file at ``path`` and return its parsed entity records.

    The file is iterated line by line without a size limit, so a line is
    never handed to the parser in pieces, and the file is closed when
    reading finishes or fails.
    """
    with codecs.open(path, 'r', encoding='utf-8') as fid:
        return parse_entities(fid)


def write_key_title(entities, out_path):
    """Write one ``enity_id,<TAB>title`` line per entity to ``out_path``."""
    with codecs.open(out_path, 'w', encoding='utf-8') as fod:
        for (entity_id, title), _properties in entities:
            fod.write('%s,\t%s\n' % (entity_id, title))


def write_context(entities, out_path):
    """Write every entity with its properties to ``out_path``.

    Each entity produces a ``#enity_id,<TAB>title`` line, one
    ``@name,<TAB>value`` line per property, and a terminating blank line.
    """
    with codecs.open(out_path, 'w', encoding='utf-8') as fod:
        for (entity_id, title), properties in entities:
            fod.write('#%s,\t%s\n' % (entity_id, title))
            for prop_name, prop_value in properties:
                fod.write('@%s,\t%s\n' % (prop_name, prop_value))
            fod.write('\n')


def main():
    """Parse ``kb_file`` and write the two text dumps next to it."""
    entity_set = load_entities(kb_file)

    print('write PKBase_key_title')
    write_key_title(entity_set, kb_path + 'PKBase_key_title.txt')

    print('write PKBase_context')
    write_context(entity_set, kb_path + 'PKBase_context.txt')


if __name__ == '__main__':
    main()
    
    
    
    